# -*- coding:utf-8 -*-

# @Time    : 2018/10/30 4:26 PM

# @Author  : Swing


# Collect the distinct event IDs referenced by the training and test sets.
uniqueEvents = set()

for filename in ["train.csv", "test.csv"]:
    # Files are read in binary mode, so IDs are stored as bytes objects.
    # The context manager closes the file even if a record fails to parse.
    with open(filename, 'rb') as f:
        # Skip the first line (column names).
        f.readline()

        for line in f:  # one record per line
            cols = line.strip().split(b',')
            if len(cols) < 2:
                # Blank or truncated line: there is no second column to read.
                continue
            uniqueEvents.add(cols[1])  # the second column is the event ID

n_uniqueEvents = len(uniqueEvents)

print("number of uniqueEvents :%d" % n_uniqueEvents)

# Copy the rows of events.csv whose event ID is in uniqueEvents into
# usefulEvents.csv.
# The output is opened with 'wb' (truncate/create) instead of append mode so
# that re-running the script does not add a second header and duplicate rows.
# The context manager closes both files even if an error occurs mid-copy.
with open('events.csv', 'rb') as fo, open('usefulEvents.csv', 'wb') as fp:
    # Copy the header row.
    titles = fo.readline()
    fp.write(titles)

    for line in fo:
        if not uniqueEvents:
            # Every wanted event has been written; skip the rest of the file.
            break
        event_no = line.strip().split(b',')[0]  # the first column is the event ID
        if event_no in uniqueEvents:
            fp.write(line)
            # Remove the ID so each event is written at most once and the
            # remaining count reported below shrinks towards zero.
            uniqueEvents.remove(event_no)
            print('Write event id: ', str(event_no, encoding='UTF-8'), ', Remaining: ', len(uniqueEvents))
